{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-24T14:17:00.215931Z",
     "start_time": "2024-09-24T14:16:55.072605Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/numpy/core/getlimits.py:499: UserWarning: The value of the smallest subnormal for <class 'numpy.float64'> type is zero.\n",
      "  setattr(self, word, getattr(machar, word).flat[0])\n",
      "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for <class 'numpy.float64'> type is zero.\n",
      "  return self._float_to_str(self.smallest_subnormal)\n",
      "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/numpy/core/getlimits.py:499: UserWarning: The value of the smallest subnormal for <class 'numpy.float32'> type is zero.\n",
      "  setattr(self, word, getattr(machar, word).flat[0])\n",
      "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/numpy/core/getlimits.py:89: UserWarning: The value of the smallest subnormal for <class 'numpy.float32'> type is zero.\n",
      "  return self._float_to_str(self.smallest_subnormal)\n",
      "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n",
      "Building prefix dict from the default dictionary ...\n",
      "Loading model from cache /tmp/jieba.cache\n",
      "Loading model cost 1.294 seconds.\n",
      "Prefix dict has been built successfully.\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "os.environ['HF_ENDPOINT'] = 'https://hf-mirror.com'\n",
    "os.environ[\"KMP_DUPLICATE_LIB_OK\"]=\"TRUE\"\n",
    "import mindspore\n",
    "from tqdm import tqdm\n",
    "from mindnlp.transformers import BartTokenizer, AutoModelForSeq2SeqLM\n",
    "import numpy as np\n",
    "\n",
    "from mindnlp.peft import AdaLoraConfig, PeftConfig, PeftModel, TaskType, get_peft_model\n",
    "from mindspore.dataset import GeneratorDataset\n",
    "\n",
    "if \"RANK_TABLE_FILE\" in os.environ:\n",
    "    del os.environ[\"RANL_TABLE_FILE\"]\n",
    "\n",
    "\n",
    "# os.environ[\"TOKENIZERS_PARALLELISM\"] = \"false\"\n",
    "model_name_or_path = \"facebook/bart-base\"\n",
    "checkpoint_name = \"financial_sentiment_analysis_lora_v1.pt\"\n",
    "text_column = \"sentence\"\n",
    "label_column = \"text_label\"\n",
    "max_length = 128\n",
    "lr = 1e-3\n",
    "num_epochs = 8\n",
    "batch_size = 8\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-24T14:17:01.751853Z",
     "start_time": "2024-09-24T14:17:01.739727Z"
    }
   },
   "outputs": [],
   "source": [
    "# creating model\n",
    "peft_config = AdaLoraConfig(\n",
    "    init_r=12,\n",
    "    target_r=8,\n",
    "    beta1=0.85,\n",
    "    beta2=0.85,\n",
    "    tinit=200,\n",
    "    tfinal=1000,\n",
    "    deltaT=10,\n",
    "    lora_alpha=32,\n",
    "    lora_dropout=0.1,\n",
    "    task_type=TaskType.SEQ_2_SEQ_LM,\n",
    "    inference_mode=False,\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "ExecuteTime": {
     "end_time": "2024-09-24T14:17:10.395696Z",
     "start_time": "2024-09-24T14:17:06.794849Z"
    }
   },
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "BartForConditionalGeneration has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`.`PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.\n",
      "  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).\n",
      "  - If you are not the owner of the model architecture class, please contact the model code owner to update it.\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[MS_ALLOC_CONF]Runtime config:  enable_vmm:True  vmm_align_size:2MB\n",
      "trainable params: 2,434,176 || all params: 141,854,688 || trainable%: 1.715964438200308\n"
     ]
    }
   ],
   "source": [
    "model = AutoModelForSeq2SeqLM.from_pretrained(model_name_or_path)\n",
    "model = get_peft_model(model, peft_config)\n",
    "model.print_trainable_parameters()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "# download dataset\n",
    "from mindnlp.dataset import load_dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "[WARNING] ME(13021:281473221177520,MainProcess):2024-10-17-20:25:53.512.70 [mindspore/dataset/engine/datasets.py:1217] Dataset is shuffled before split.\n"
     ]
    }
   ],
   "source": [
    "#加载数据集\n",
    "from mindnlp.transformers import AutoTokenizer\n",
    "\n",
    "dataset = load_dataset(\"financial_phrasebank\", \"sentences_allagree\")\n",
    "train_dataset, validation_dataset = dataset.shuffle(64).split([0.9, 0.1])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['negative', 'neutral', 'positive']"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "#添加文本标签\n",
    "classes = dataset.source.ds.features[\"label\"].names\n",
    "def add_text_label(sentence, label):\n",
    "    return sentence, label, classes[label.item()]\n",
    "\n",
    "train_dataset = train_dataset.map(add_text_label, ['sentence', 'label'], ['sentence', 'label', 'text_label'])\n",
    "validation_dataset = validation_dataset.map(add_text_label, ['sentence', 'label'], ['sentence', 'label', 'text_label'])\n",
    "classes"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/ma-user/anaconda3/envs/MindSpore/lib/python3.9/site-packages/mindnlp/transformers/tokenization_utils_base.py:1526: FutureWarning: `clean_up_tokenization_spaces` was not set. It will be set to `True` by default. This behavior will be depracted, and will be then set to `False` by default. \n",
      "  warnings.warn(\n"
     ]
    }
   ],
   "source": [
    "#分词化\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_name_or_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "import numpy as np\n",
    "from mindnlp.dataset import BaseMapFunction\n",
    "from threading import Lock\n",
    "lock = Lock()\n",
    "class MapFunc(BaseMapFunction):\n",
    "    def __call__(self, sentence, label, text_label):\n",
    "        lock.acquire()\n",
    "        model_inputs = tokenizer(sentence, max_length=max_length, padding=\"max_length\", truncation=True)\n",
    "        labels = tokenizer(text_label, max_length=3, padding=\"max_length\", truncation=True)\n",
    "        lock.release()\n",
    "        labels = labels['input_ids']\n",
    "        labels = np.where(np.equal(labels, tokenizer.pad_token_id), -100, labels)\n",
    "        return model_inputs['input_ids'], model_inputs['attention_mask'], labels"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "\n",
    "def get_dataset(dataset, tokenizer, batch_size=None, shuffle=True):\n",
    "    input_colums=['sentence', 'label', 'text_label']\n",
    "    output_columns=['input_ids', 'attention_mask', 'labels']\n",
    "    dataset = dataset.map(MapFunc(input_colums, output_columns),\n",
    "                          input_colums, output_columns)\n",
    "    if shuffle:\n",
    "        dataset = dataset.shuffle(64)\n",
    "    if batch_size:\n",
    "        dataset = dataset.batch(batch_size)\n",
    "    return dataset\n",
    "\n",
    "train_dataset = get_dataset(train_dataset, tokenizer,batch_size=batch_size)\n",
    "eval_dataset = get_dataset(validation_dataset, tokenizer, batch_size=batch_size,shuffle=False)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [],
   "source": [
    "from mindnlp.common.optimization import get_linear_schedule_with_warmup\n",
    "from mindnlp.core import optim\n",
    "# Setting up optimizer and learning rate scheduler\n",
    "optimizer = optim.AdamW(model.trainable_params(), lr=1e-3)\n",
    "lr_scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=(len(train_dataset) * num_epochs))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "model.base_model.peft_config[\"default\"].total_step = len(train_dataset) * num_epochs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "  0%|          | 0/255 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "-\r"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 255/255 [03:15<00:00,  1.31it/s]\n",
      "  3%|▎         | 1/29 [00:01<00:39,  1.41s/it]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "\\\r"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 29/29 [00:07<00:00,  3.84it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch=0: train_ppl=Tensor(shape=[], dtype=Float32, value= 3.03945) train_epoch_loss=Tensor(shape=[], dtype=Float32, value= 1.11168) eval_ppl=Tensor(shape=[], dtype=Float32, value= 1.1222) eval_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.115295)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 255/255 [02:44<00:00,  1.55it/s]\n",
      "100%|██████████| 29/29 [00:05<00:00,  5.02it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch=1: train_ppl=Tensor(shape=[], dtype=Float32, value= 1.14435) train_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.134838) eval_ppl=Tensor(shape=[], dtype=Float32, value= 1.04685) eval_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.0457841)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 255/255 [02:52<00:00,  1.48it/s]\n",
      "100%|██████████| 29/29 [00:05<00:00,  4.88it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch=2: train_ppl=Tensor(shape=[], dtype=Float32, value= 1.08269) train_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.0794451) eval_ppl=Tensor(shape=[], dtype=Float32, value= 1.02456) eval_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.0242597)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 255/255 [02:50<00:00,  1.50it/s]\n",
      "100%|██████████| 29/29 [00:05<00:00,  5.15it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch=3: train_ppl=Tensor(shape=[], dtype=Float32, value= 1.0693) train_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.0670007) eval_ppl=Tensor(shape=[], dtype=Float32, value= 1.01534) eval_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.0152213)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 255/255 [02:50<00:00,  1.50it/s]\n",
      "100%|██████████| 29/29 [00:07<00:00,  4.05it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch=4: train_ppl=Tensor(shape=[], dtype=Float32, value= 1.05028) train_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.0490523) eval_ppl=Tensor(shape=[], dtype=Float32, value= 1.00823) eval_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.00819443)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 255/255 [02:59<00:00,  1.42it/s]\n",
      "100%|██████████| 29/29 [00:06<00:00,  4.69it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch=5: train_ppl=Tensor(shape=[], dtype=Float32, value= 1.05887) train_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.0572002) eval_ppl=Tensor(shape=[], dtype=Float32, value= 1.01062) eval_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.0105656)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 255/255 [02:46<00:00,  1.53it/s]\n",
      "100%|██████████| 29/29 [00:05<00:00,  5.04it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch=6: train_ppl=Tensor(shape=[], dtype=Float32, value= 1.033) train_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.0324655) eval_ppl=Tensor(shape=[], dtype=Float32, value= 1.01462) eval_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.0145114)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 255/255 [02:47<00:00,  1.52it/s]\n",
      "100%|██████████| 29/29 [00:05<00:00,  5.27it/s]"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch=7: train_ppl=Tensor(shape=[], dtype=Float32, value= 1.02977) train_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.0293341) eval_ppl=Tensor(shape=[], dtype=Float32, value= 1.00767) eval_epoch_loss=Tensor(shape=[], dtype=Float32, value= 0.00763729)\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "\n"
     ]
    }
   ],
   "source": [
    "from mindnlp.core import value_and_grad\n",
    "# Forward function to compute the loss\n",
    "def forward_fn(**batch):\n",
    "    outputs = model(**batch)\n",
    "    loss = outputs.loss\n",
    "    return loss\n",
    "\n",
    "# Gradient function to compute gradients for optimization\n",
    "grad_fn = value_and_grad(forward_fn, model.trainable_params(), attach_grads=True)\n",
    "\n",
    "from mindspore import ops\n",
    "global_step = 0\n",
    "for epoch in range(num_epochs):\n",
    "    model.set_train(True)\n",
    "    total_loss = 0\n",
    "    train_total_size = train_dataset.get_dataset_size()\n",
    "    # Iterate over each entry in the training dataset\n",
    "    for step, batch in enumerate(tqdm(train_dataset.create_dict_iterator(), total=train_total_size)):  \n",
    "        optimizer.zero_grad()\n",
    "        loss = grad_fn(**batch)\n",
    "        optimizer.step()\n",
    "        total_loss += loss.float()  # Accumulate loss for monitoring\n",
    "        lr_scheduler.step()  # Update learning rate based on scheduler\n",
    "        # model.base_model.update_and_allocate(global_step,grads)\n",
    "        global_step += 1\n",
    "    model.set_train(False)\n",
    "    eval_loss = 0\n",
    "    eval_preds = []\n",
    "    eval_total_size = eval_dataset.get_dataset_size()\n",
    "    # Iterate over each entry in the evaluation dataset\n",
    "    for step, batch in enumerate(tqdm(eval_dataset.create_dict_iterator(), total=eval_total_size)): \n",
    "        with mindspore._no_grad():\n",
    "            outputs = model(**batch)\n",
    "        loss = outputs.loss\n",
    "        eval_loss += loss.float()\n",
    "        eval_preds.extend(\n",
    "            tokenizer.batch_decode(ops.argmax(outputs.logits, -1).asnumpy(), skip_special_tokens=True)\n",
    "        )\n",
    "    eval_epoch_loss = eval_loss / len(eval_dataset)\n",
    "    eval_ppl = ops.exp(eval_epoch_loss) # Perplexity\n",
    "    train_epoch_loss = total_loss / len(train_dataset)\n",
    "    train_ppl = ops.exp(train_epoch_loss) # Perplexity\n",
    "    print(f\"{epoch=}: {train_ppl=} {train_epoch_loss=} {eval_ppl=} {eval_epoch_loss=}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "accuracy=42.92035398230089 % on the evaluation dataset\n",
      "eval_preds[:10]=['neutral', 'neutral', 'neutral', 'negative', 'neutral', 'neutral', 'positive', 'neutral', 'positive', 'neutral']\n",
      "ground_truth[:10]=['neutral', 'negative', 'positive', 'neutral', 'negative', 'neutral', 'neutral', 'positive', 'neutral', 'positive']\n"
     ]
    }
   ],
   "source": [
    "# Initialize counters for correct predictions and total predictions\n",
    "correct = 0\n",
    "total = 0\n",
    "\n",
    "# List to store actual labels for comparison\n",
    "ground_truth = []\n",
    "\n",
    "# Compare each predicted label with the true label\n",
    "for pred, data in zip(eval_preds, validation_dataset.create_dict_iterator(output_numpy=True)):\n",
    "    true = str(data['text_label'])\n",
    "    ground_truth.append(true)\n",
    "    if pred.strip() == true.strip():\n",
    "        correct += 1\n",
    "    total += 1\n",
    "\n",
    "# Calculate the percentage of correct predictions\n",
    "accuracy = correct / total * 100\n",
    "\n",
    "# Output the accuracy and sample predictions for review\n",
    "print(f\"{accuracy=} % on the evaluation dataset\")\n",
    "print(f\"{eval_preds[:10]=}\")\n",
    "print(f\"{ground_truth[:10]=}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Save the model\n",
    "peft_model_id = f\"../../output/{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\"\n",
    "model.save_pretrained(peft_model_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "from mindnlp.transformers import AutoModelForSeq2SeqLM\n",
    "from mindnlp.peft import PeftModel, PeftConfig\n",
    "\n",
    "peft_model_id = f\"../../output/{model_name_or_path}_{peft_config.peft_type}_{peft_config.task_type}\"\n",
    "\n",
    "# Load the model configuration\n",
    "config = PeftConfig.from_pretrained(peft_model_id)\n",
    "\n",
    "# Load the model\n",
    "model = AutoModelForSeq2SeqLM.from_pretrained(config.base_model_name_or_path)\n",
    "\n",
    "# Load the pretrained adapter\n",
    "model = PeftModel.from_pretrained(model, peft_model_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "{'input_ids': Tensor(shape=[1, 20], dtype=Int64, value=\n",
      "[[    0,   487, 47435 ...  1257,     4,     2]]), 'attention_mask': Tensor(shape=[1, 20], dtype=Int64, value=\n",
      "[[1, 1, 1 ... 1, 1, 1]])}\n",
      "[[    2     0 22173     2]]\n",
      "['positive']\n"
     ]
    }
   ],
   "source": [
    "# Retrieve an entry from the validation dataset.\n",
    "# Alternatively, create your own text\n",
    "example = {'sentence': 'Nvidia Tops $3 Trillion in Market Value, Leapfrogging Apple.'}\n",
    "\n",
    "inputs = tokenizer(example['sentence'], return_tensors=\"ms\") # Get the tokenized text label\n",
    "print(inputs)\n",
    "\n",
    "model.set_train(False)\n",
    "with mindspore._no_grad():\n",
    "    outputs = model.generate(input_ids=inputs[\"input_ids\"], max_new_tokens=10) # Predict the text label using the trained model\n",
    "    print(outputs)\n",
    "    print(tokenizer.batch_decode(outputs.asnumpy(), skip_special_tokens=True)) # Print decoded text label from the prediction"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "MindSpore",
   "language": "python",
   "name": "mindspore"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
